{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ff509a29",
   "metadata": {},
   "source": [
    "用户贷款违约预测：https://www.heywhale.com/home/competition/615ff7bdc270e400182b249e/content/1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a78c6e37",
   "metadata": {},
   "source": [
    "# 导入模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dfdde11b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T13:30:56.028286Z",
     "start_time": "2021-10-16T13:30:56.023286Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 0 ns\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import pandas as pd\n",
    "from toad import detect\n",
    "\n",
    "pd.set_option('display.width', 180)\n",
    "pd.set_option('display.max_rows', None)\n",
    "pd.set_option('display.max_columns', 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13ebb22b",
   "metadata": {},
   "source": [
    "# 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c117be1c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T12:59:49.002499Z",
     "start_time": "2021-10-16T12:59:48.509470Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 488 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Load the train and test CSVs; the relative paths resolve against the kernel's working directory.\n",
    "train = pd.read_csv(filepath_or_buffer=\"../data/train.csv\")\n",
    "test = pd.read_csv(filepath_or_buffer=\"../data/test.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ba0858b",
   "metadata": {},
   "source": [
    "# 数据探索"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb311528",
   "metadata": {},
   "source": [
    "## 查看前五行"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "084eec56",
   "metadata": {},
   "source": [
    "### 训练集查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1d6438da",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T12:59:54.239798Z",
     "start_time": "2021-10-16T12:59:54.211796Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>income</th>\n",
       "      <th>age</th>\n",
       "      <th>experience_years</th>\n",
       "      <th>is_married</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "      <th>current_job_years</th>\n",
       "      <th>current_house_years</th>\n",
       "      <th>house_ownership</th>\n",
       "      <th>car_ownership</th>\n",
       "      <th>profession</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>train_0</td>\n",
       "      <td>8529345</td>\n",
       "      <td>44</td>\n",
       "      <td>2</td>\n",
       "      <td>single</td>\n",
       "      <td>210</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>train_1</td>\n",
       "      <td>7848654</td>\n",
       "      <td>55</td>\n",
       "      <td>9</td>\n",
       "      <td>single</td>\n",
       "      <td>229</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>13</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>43</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>train_2</td>\n",
       "      <td>8491491</td>\n",
       "      <td>61</td>\n",
       "      <td>20</td>\n",
       "      <td>single</td>\n",
       "      <td>114</td>\n",
       "      <td>28</td>\n",
       "      <td>8</td>\n",
       "      <td>11</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>train_3</td>\n",
       "      <td>8631544</td>\n",
       "      <td>69</td>\n",
       "      <td>13</td>\n",
       "      <td>married</td>\n",
       "      <td>276</td>\n",
       "      <td>14</td>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>27</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>train_4</td>\n",
       "      <td>6947233</td>\n",
       "      <td>62</td>\n",
       "      <td>10</td>\n",
       "      <td>single</td>\n",
       "      <td>56</td>\n",
       "      <td>11</td>\n",
       "      <td>10</td>\n",
       "      <td>12</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>47</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id   income  age  experience_years is_married  city  region  current_job_years  current_house_years house_ownership car_ownership  profession  label\n",
       "0  train_0  8529345   44                 2     single   210       0                  2                   10          rented            no          13      0\n",
       "1  train_1  7848654   55                 9     single   229       2                  9                   13          rented            no          43      0\n",
       "2  train_2  8491491   61                20     single   114      28                  8                   11          rented            no          12      0\n",
       "3  train_3  8631544   69                13    married   276      14                 13                   12          rented            no          27      0\n",
       "4  train_4  6947233   62                10     single    56      11                 10                   12          rented            no          47      0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Preview the first five rows of the training set.\n",
    "train.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "972d1e0a",
   "metadata": {},
   "source": [
    "### 测试集查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9892f99b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T13:30:41.313445Z",
     "start_time": "2021-10-16T13:30:41.297444Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1e+03 µs\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>income</th>\n",
       "      <th>age</th>\n",
       "      <th>experience_years</th>\n",
       "      <th>is_married</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "      <th>current_job_years</th>\n",
       "      <th>current_house_years</th>\n",
       "      <th>house_ownership</th>\n",
       "      <th>car_ownership</th>\n",
       "      <th>profession</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>test_0</td>\n",
       "      <td>4260981</td>\n",
       "      <td>47</td>\n",
       "      <td>18</td>\n",
       "      <td>single</td>\n",
       "      <td>118</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>13</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>test_1</td>\n",
       "      <td>1537266</td>\n",
       "      <td>78</td>\n",
       "      <td>13</td>\n",
       "      <td>single</td>\n",
       "      <td>73</td>\n",
       "      <td>22</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>test_2</td>\n",
       "      <td>4716686</td>\n",
       "      <td>74</td>\n",
       "      <td>11</td>\n",
       "      <td>single</td>\n",
       "      <td>96</td>\n",
       "      <td>25</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>norent_noown</td>\n",
       "      <td>no</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>test_3</td>\n",
       "      <td>4245860</td>\n",
       "      <td>34</td>\n",
       "      <td>3</td>\n",
       "      <td>single</td>\n",
       "      <td>232</td>\n",
       "      <td>14</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>test_4</td>\n",
       "      <td>493379</td>\n",
       "      <td>66</td>\n",
       "      <td>6</td>\n",
       "      <td>single</td>\n",
       "      <td>145</td>\n",
       "      <td>28</td>\n",
       "      <td>6</td>\n",
       "      <td>11</td>\n",
       "      <td>rented</td>\n",
       "      <td>yes</td>\n",
       "      <td>28</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id   income  age  experience_years is_married  city  region  current_job_years  current_house_years house_ownership car_ownership  profession  label\n",
       "0  test_0  4260981   47                18     single   118       0                  7                   13          rented            no          39      0\n",
       "1  test_1  1537266   78                13     single    73      22                  9                   12          rented            no          50      0\n",
       "2  test_2  4716686   74                11     single    96      25                  9                   10    norent_noown            no           1      0\n",
       "3  test_3  4245860   34                 3     single   232      14                  3                   11          rented            no           6      0\n",
       "4  test_4   493379   66                 6     single   145      28                  6                   11          rented           yes          28      0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Preview the first five rows of the test set.\n",
    "test.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f8f53ba",
   "metadata": {},
   "source": [
    "## 查看数据类型、缺失值和枚举值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cae7f0c1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T13:50:51.768679Z",
     "start_time": "2021-10-16T13:50:51.000635Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 756 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>size</th>\n",
       "      <th>missing</th>\n",
       "      <th>unique</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <td>object</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>168000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>income</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>30068</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>experience_years</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>is_married</th>\n",
       "      <td>object</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>city</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>317</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>current_job_years</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>current_house_years</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>house_ownership</th>\n",
       "      <td>object</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>car_ownership</th>\n",
       "      <td>object</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>profession</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>label</th>\n",
       "      <td>int64</td>\n",
       "      <td>168000</td>\n",
       "      <td>0.00%</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       type    size missing  unique\n",
       "id                   object  168000   0.00%  168000\n",
       "income                int64  168000   0.00%   30068\n",
       "age                   int64  168000   0.00%      59\n",
       "experience_years      int64  168000   0.00%      21\n",
       "is_married           object  168000   0.00%       2\n",
       "city                  int64  168000   0.00%     317\n",
       "region                int64  168000   0.00%      29\n",
       "current_job_years     int64  168000   0.00%      15\n",
       "current_house_years   int64  168000   0.00%       5\n",
       "house_ownership      object  168000   0.00%       3\n",
       "car_ownership        object  168000   0.00%       2\n",
       "profession            int64  168000   0.00%      51\n",
       "label                 int64  168000   0.00%       2"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Profile every training column with toad's detect() and keep only the first\n",
    "# four columns of its report (rendered below as type, size, missing, unique).\n",
    "detect(train).iloc[:, 0:4]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d358bad0",
   "metadata": {},
   "source": [
    "## 描述性统计"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5165a3a3",
   "metadata": {},
   "source": [
    "### 训练集查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d5ae6627",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T14:21:45.051680Z",
     "start_time": "2021-10-16T14:21:44.955675Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 76 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>income</th>\n",
       "      <th>age</th>\n",
       "      <th>experience_years</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "      <th>current_job_years</th>\n",
       "      <th>current_house_years</th>\n",
       "      <th>profession</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.680000e+05</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "      <td>168000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.994944e+06</td>\n",
       "      <td>49.961577</td>\n",
       "      <td>10.088887</td>\n",
       "      <td>157.930446</td>\n",
       "      <td>13.801554</td>\n",
       "      <td>6.339571</td>\n",
       "      <td>11.997673</td>\n",
       "      <td>25.251054</td>\n",
       "      <td>0.123065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.879353e+06</td>\n",
       "      <td>17.053195</td>\n",
       "      <td>5.998594</td>\n",
       "      <td>92.123165</td>\n",
       "      <td>9.379915</td>\n",
       "      <td>3.647073</td>\n",
       "      <td>1.399613</td>\n",
       "      <td>14.722342</td>\n",
       "      <td>0.328513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.031000e+04</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2.499018e+06</td>\n",
       "      <td>35.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>78.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.994848e+06</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>157.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>25.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>7.475446e+06</td>\n",
       "      <td>65.000000</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>238.000000</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>9.999938e+06</td>\n",
       "      <td>79.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>316.000000</td>\n",
       "      <td>28.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             income            age  experience_years           city         region  current_job_years  current_house_years     profession          label\n",
       "count  1.680000e+05  168000.000000     168000.000000  168000.000000  168000.000000      168000.000000        168000.000000  168000.000000  168000.000000\n",
       "mean   4.994944e+06      49.961577         10.088887     157.930446      13.801554           6.339571            11.997673      25.251054       0.123065\n",
       "std    2.879353e+06      17.053195          5.998594      92.123165       9.379915           3.647073             1.399613      14.722342       0.328513\n",
       "min    1.031000e+04      21.000000          0.000000       0.000000       0.000000           0.000000            10.000000       0.000000       0.000000\n",
       "25%    2.499018e+06      35.000000          5.000000      78.000000       6.000000           4.000000            11.000000      13.000000       0.000000\n",
       "50%    4.994848e+06      50.000000         10.000000     157.000000      14.000000           6.000000            12.000000      25.000000       0.000000\n",
       "75%    7.475446e+06      65.000000         15.000000     238.000000      22.000000           9.000000            13.000000      38.000000       0.000000\n",
       "max    9.999938e+06      79.000000         20.000000     316.000000      28.000000          14.000000            14.000000      50.000000       1.000000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Summary statistics of the training set; the quartiles listed are pandas' defaults.\n",
    "train.describe(percentiles=[0.25, 0.5, 0.75])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ce4d14f",
   "metadata": {},
   "source": [
    "### 测试集查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4b9851f0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T14:22:15.707434Z",
     "start_time": "2021-10-16T14:22:15.691433Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 0 ns\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>income</th>\n",
       "      <th>age</th>\n",
       "      <th>experience_years</th>\n",
       "      <th>is_married</th>\n",
       "      <th>city</th>\n",
       "      <th>region</th>\n",
       "      <th>current_job_years</th>\n",
       "      <th>current_house_years</th>\n",
       "      <th>house_ownership</th>\n",
       "      <th>car_ownership</th>\n",
       "      <th>profession</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>test_0</td>\n",
       "      <td>4260981</td>\n",
       "      <td>47</td>\n",
       "      <td>18</td>\n",
       "      <td>single</td>\n",
       "      <td>118</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>13</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>test_1</td>\n",
       "      <td>1537266</td>\n",
       "      <td>78</td>\n",
       "      <td>13</td>\n",
       "      <td>single</td>\n",
       "      <td>73</td>\n",
       "      <td>22</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>test_2</td>\n",
       "      <td>4716686</td>\n",
       "      <td>74</td>\n",
       "      <td>11</td>\n",
       "      <td>single</td>\n",
       "      <td>96</td>\n",
       "      <td>25</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>norent_noown</td>\n",
       "      <td>no</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>test_3</td>\n",
       "      <td>4245860</td>\n",
       "      <td>34</td>\n",
       "      <td>3</td>\n",
       "      <td>single</td>\n",
       "      <td>232</td>\n",
       "      <td>14</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>rented</td>\n",
       "      <td>no</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>test_4</td>\n",
       "      <td>493379</td>\n",
       "      <td>66</td>\n",
       "      <td>6</td>\n",
       "      <td>single</td>\n",
       "      <td>145</td>\n",
       "      <td>28</td>\n",
       "      <td>6</td>\n",
       "      <td>11</td>\n",
       "      <td>rented</td>\n",
       "      <td>yes</td>\n",
       "      <td>28</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id   income  age  experience_years is_married  city  region  current_job_years  current_house_years house_ownership car_ownership  profession  label\n",
       "0  test_0  4260981   47                18     single   118       0                  7                   13          rented            no          39      0\n",
       "1  test_1  1537266   78                13     single    73      22                  9                   12          rented            no          50      0\n",
       "2  test_2  4716686   74                11     single    96      25                  9                   10    norent_noown            no           1      0\n",
       "3  test_3  4245860   34                 3     single   232      14                  3                   11          rented            no           6      0\n",
       "4  test_4   493379   66                 6     single   145      28                  6                   11          rented           yes          28      0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Summary statistics of the test set, the counterpart of train.describe() above.\n",
    "# (This cell previously repeated test.head(); re-run it to refresh the stored output.)\n",
    "test.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc6ea7a9",
   "metadata": {},
   "source": [
    "## 特征探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "87f79dd3",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T15:38:29.788056Z",
     "start_time": "2021-10-16T15:38:29.731053Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 51 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "8.98154761904762"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Percentage of ALL training rows that have car_ownership == 'no' and label == 1\n",
    "# (the denominator is len(train), not the size of the 'no' subgroup).\n",
    "# A boolean mask replaces filter + groupby(...).count()[1], which raises KeyError\n",
    "# when the filtered subset contains no label-1 rows.\n",
    "((train[\"car_ownership\"] == \"no\") & (train[\"label\"] == 1)).mean() * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "98255fd3",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T15:38:25.387805Z",
     "start_time": "2021-10-16T15:38:25.346802Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.325"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Percentage of ALL training rows that have car_ownership == 'yes' and label == 1\n",
    "# (the denominator is len(train), not the size of the 'yes' subgroup).\n",
    "# A boolean mask replaces filter + groupby(...).count()[1], which raises KeyError\n",
    "# when the filtered subset contains no label-1 rows.\n",
    "((train[\"car_ownership\"] == \"yes\") & (train[\"label\"] == 1)).mean() * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "df516f56",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-16T17:17:46.349545Z",
     "start_time": "2021-10-16T17:17:46.343545Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<p>lorem ipsum, John dolor sit amet</p>'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from functools import wraps\n",
    "\n",
    "\n",
    "def p_decorate(func):\n",
    "    \"\"\"Decorate ``func`` so that its return value is wrapped in an HTML ``<p>`` element.\n",
    "\n",
    "    Args:\n",
    "        func: Callable whose result is formatted into the paragraph.\n",
    "\n",
    "    Returns:\n",
    "        A wrapper that forwards every positional and keyword argument to\n",
    "        ``func`` and returns the result enclosed in ``<p>...</p>``.\n",
    "    \"\"\"\n",
    "    @wraps(func)  # keep the wrapped function's __name__ and __doc__\n",
    "    def func_wrapper(*args, **kwargs):\n",
    "        return \"<p>{0}</p>\".format(func(*args, **kwargs))\n",
    "    return func_wrapper\n",
    "\n",
    "\n",
    "@p_decorate\n",
    "def get_text(name):\n",
    "    \"\"\"Return the placeholder sentence with ``name`` substituted into it.\"\"\"\n",
    "    return \"lorem ipsum, {0} dolor sit amet\".format(name)\n",
    "\n",
    "\n",
    "get_text(\"John\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "357c32c0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:lkm]",
   "language": "python",
   "name": "lkm"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "256px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
